EDA¶
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.ticker import PercentFormatter
plt.rcParams.update({ "figure.figsize" : (8, 5),"axes.facecolor" : "white", "axes.edgecolor": "black"})
plt.rcParams["figure.facecolor"]= "w"
pd.plotting.register_matplotlib_converters()
pd.set_option('display.float_format', lambda x: '%.3f' % x)
# Load the two raw CSV exports: per-sale records and per-house details.
sales_df = pd.read_csv('data/eda_house_sales.csv')
sales_df
details_df = pd.read_csv('data/eda_house_details.csv')
details_df
| id | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront | view | condition | grade | sqft_above | sqft_basement | yr_built | yr_renovated | zipcode | lat | long | sqft_living15 | sqft_lot15 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1000102 | 6.000 | 3.000 | 2400.000 | 9373.000 | 2.000 | NaN | 0.000 | 3 | 7 | 2400.000 | 0.000 | 1991 | 0.000 | 98002 | 47.326 | -122.214 | 2060.000 | 7316.000 |
| 1 | 100100050 | 3.000 | 1.000 | 1320.000 | 11090.000 | 1.000 | 0.000 | 0.000 | 3 | 7 | 1320.000 | 0.000 | 1955 | 0.000 | 98155 | 47.775 | -122.304 | 1320.000 | 8319.000 |
| 2 | 1001200035 | 3.000 | 1.000 | 1350.000 | 7973.000 | 1.500 | NaN | 0.000 | 3 | 7 | 1350.000 | 0.000 | 1954 | 0.000 | 98188 | 47.432 | -122.292 | 1310.000 | 7491.000 |
| 3 | 1001200050 | 4.000 | 1.500 | 1260.000 | 7248.000 | 1.500 | NaN | 0.000 | 5 | 7 | 1260.000 | 0.000 | 1955 | NaN | 98188 | 47.433 | -122.292 | 1300.000 | 7732.000 |
| 4 | 1003000175 | 3.000 | 1.000 | 980.000 | 7606.000 | 1.000 | 0.000 | 0.000 | 3 | 7 | 980.000 | 0.000 | 1954 | 0.000 | 98188 | 47.436 | -122.290 | 980.000 | 8125.000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 21415 | 993002177 | 3.000 | 2.500 | 1380.000 | 1547.000 | 3.000 | 0.000 | 0.000 | 3 | 8 | 1380.000 | 0.000 | 2000 | NaN | 98103 | 47.691 | -122.341 | 1380.000 | 1465.000 |
| 21416 | 993002225 | 3.000 | 2.250 | 1520.000 | 1245.000 | 3.000 | NaN | 0.000 | 3 | 8 | 1520.000 | 0.000 | 2004 | 0.000 | 98103 | 47.691 | -122.340 | 1520.000 | 1470.000 |
| 21417 | 993002247 | 3.000 | 2.250 | 1550.000 | 1469.000 | 3.000 | 0.000 | 0.000 | 3 | 8 | 1550.000 | 0.000 | 2004 | 0.000 | 98103 | 47.691 | -122.341 | 1520.000 | 1465.000 |
| 21418 | 993002325 | 2.000 | 1.500 | 950.000 | 4625.000 | 1.000 | 0.000 | 0.000 | 4 | 7 | 950.000 | 0.000 | 1949 | NaN | 98103 | 47.691 | -122.340 | 1440.000 | 4625.000 |
| 21419 | 999000215 | 4.000 | 2.500 | 2760.000 | 5000.000 | 1.500 | 0.000 | 0.000 | 5 | 7 | 1680.000 | 1080.000 | 1928 | 0.000 | 98107 | 47.673 | -122.371 | 1850.000 | 5000.000 |
21420 rows × 19 columns
# Merge the datasets with a left join (details_df as the left table) so that
# every house in details_df is retained, matching the stated intent.
# NOTE(review): the original used how="outer", which contradicted this
# comment. On this data left and outer joins produce the same 21,597 rows
# (every sale matches a house — see the all-non-null info() output below),
# but "left" matches the documented intent and protects against unmatched
# sales rows creating half-empty records.
merged_df = pd.merge(details_df, sales_df, left_on="id", right_on="house_id",
                     how="left", suffixes=('_details', '_sales'))
# Display the first few rows to verify the merge
merged_df.head(10)
| id_details | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront | view | condition | grade | ... | yr_renovated | zipcode | lat | long | sqft_living15 | sqft_lot15 | date | price | house_id | id_sales | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1000102 | 6.000 | 3.000 | 2400.000 | 9373.000 | 2.000 | NaN | 0.000 | 3 | 7 | ... | 0.000 | 98002 | 47.326 | -122.214 | 2060.000 | 7316.000 | 2014-09-16 | 280000.000 | 1000102 | 2495 |
| 1 | 1000102 | 6.000 | 3.000 | 2400.000 | 9373.000 | 2.000 | NaN | 0.000 | 3 | 7 | ... | 0.000 | 98002 | 47.326 | -122.214 | 2060.000 | 7316.000 | 2015-04-22 | 300000.000 | 1000102 | 2496 |
| 2 | 1200019 | 4.000 | 1.750 | 2060.000 | 26036.000 | 1.000 | NaN | 0.000 | 4 | 8 | ... | 0.000 | 98166 | 47.444 | -122.351 | 2590.000 | 21891.000 | 2014-05-08 | 647500.000 | 1200019 | 6730 |
| 3 | 1200021 | 3.000 | 1.000 | 1460.000 | 43000.000 | 1.000 | 0.000 | 0.000 | 3 | 7 | ... | 0.000 | 98166 | 47.443 | -122.347 | 2250.000 | 20023.000 | 2014-08-11 | 400000.000 | 1200021 | 8405 |
| 4 | 2800031 | 3.000 | 1.000 | 1430.000 | 7599.000 | 1.500 | 0.000 | 0.000 | 4 | 6 | ... | 0.000 | 98168 | 47.478 | -122.265 | 1290.000 | 10320.000 | 2015-04-01 | 235000.000 | 2800031 | 8801 |
| 5 | 3600057 | 4.000 | 2.000 | 1650.000 | 3504.000 | 1.000 | 0.000 | 0.000 | 3 | 7 | ... | 20130.000 | 98144 | 47.580 | -122.294 | 1480.000 | 3504.000 | 2015-03-19 | 402500.000 | 3600057 | 3554 |
| 6 | 3600072 | 4.000 | 2.750 | 2220.000 | 5310.000 | 1.000 | NaN | 0.000 | 5 | 7 | ... | NaN | 98144 | 47.580 | -122.294 | 1540.000 | 4200.000 | 2015-03-30 | 680000.000 | 3600072 | 18507 |
| 7 | 3800008 | 5.000 | 1.500 | 1990.000 | 18200.000 | 1.000 | NaN | 0.000 | 3 | 7 | ... | NaN | 98178 | 47.494 | -122.262 | 1860.000 | 8658.000 | 2015-02-24 | 178000.000 | 3800008 | 3198 |
| 8 | 5200087 | 4.000 | 2.500 | 2540.000 | 5001.000 | 2.000 | 0.000 | 0.000 | 3 | 9 | ... | 0.000 | 98108 | 47.542 | -122.302 | 2360.000 | 6834.000 | 2014-07-09 | 487000.000 | 5200087 | 21048 |
| 9 | 6200017 | 3.000 | 1.000 | 1340.000 | 21336.000 | 1.500 | 0.000 | 0.000 | 4 | 5 | ... | 0.000 | 98032 | 47.402 | -122.273 | 1340.000 | 37703.000 | 2014-11-12 | 281000.000 | 6200017 | 4330 |
10 rows × 23 columns
# Structure check: column dtypes and non-null counts of the merged frame.
merged_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 21597 entries, 0 to 21596 Data columns (total 23 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id_details 21597 non-null int64 1 bedrooms 21597 non-null float64 2 bathrooms 21597 non-null float64 3 sqft_living 21597 non-null float64 4 sqft_lot 21597 non-null float64 5 floors 21597 non-null float64 6 waterfront 19206 non-null float64 7 view 21534 non-null float64 8 condition 21597 non-null int64 9 grade 21597 non-null int64 10 sqft_above 21597 non-null float64 11 sqft_basement 21145 non-null float64 12 yr_built 21597 non-null int64 13 yr_renovated 17749 non-null float64 14 zipcode 21597 non-null int64 15 lat 21597 non-null float64 16 long 21597 non-null float64 17 sqft_living15 21597 non-null float64 18 sqft_lot15 21597 non-null float64 19 date 21597 non-null object 20 price 21597 non-null float64 21 house_id 21597 non-null int64 22 id_sales 21597 non-null int64 dtypes: float64(15), int64(7), object(1) memory usage: 3.8+ MB
# List all column names of the merged frame.
merged_df.columns
Index(['id_details', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot',
'floors', 'waterfront', 'view', 'condition', 'grade', 'sqft_above',
'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long',
'sqft_living15', 'sqft_lot15', 'date', 'price', 'house_id', 'id_sales'],
dtype='object')
Identify numerical and categorical columns. Numerical columns: ['sqft_living', 'sqft_lot', 'sqft_above', 'sqft_basement', 'lat', 'long', 'sqft_living15', 'sqft_lot15', 'price']. Categorical columns: ['id_details', 'bedrooms', 'bathrooms', 'floors', 'waterfront', 'view', 'condition', 'grade', 'date', 'yr_built', 'yr_renovated', 'zipcode', 'house_id', 'id_sales']
Checked the merged dataset for NaN, missing, and zero values, determined the associated columns, and generated per-column breakdowns of the NaN, missing, and zero counts.
# Alias used by the breakdown call below; df and merged_df reference the
# same DataFrame object (no copy is made).
df = merged_df
def missing_values_breakdown(df):
    """Return a per-column breakdown of NaN, zero, and combined "missing" values.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to inspect.

    Returns
    -------
    pd.DataFrame
        Indexed by the columns of ``df``, with absolute counts and
        percentages for NaN values, zero values, and their sum
        ("Missing Count" = NaN count + zero count).
    """
    summary = pd.DataFrame(index=df.columns)
    nan_count = df.isna().sum()
    # Compute the zero count once and reuse it (the original evaluated
    # (df == 0).sum() twice).
    zero_count = (df == 0).sum()
    summary['NaN Count'] = nan_count
    # "Missing" deliberately treats both NaNs and zeros as absent information.
    summary['Missing Count'] = nan_count + zero_count
    summary['Zero Count'] = zero_count
    n_rows = len(df)
    summary['% Missing'] = (summary['Missing Count'] / n_rows) * 100
    summary['% NaN'] = (summary['NaN Count'] / n_rows) * 100
    summary['% Zero'] = (summary['Zero Count'] / n_rows) * 100
    return summary
# Generate and print the per-column breakdown of NaN, missing, and zero values.
breakdown = missing_values_breakdown(df)
print(breakdown)
NaN Count Missing Count Zero Count % Missing % NaN % Zero id_details 0 0 0 0.000 0.000 0.000 bedrooms 0 0 0 0.000 0.000 0.000 bathrooms 0 0 0 0.000 0.000 0.000 sqft_living 0 0 0 0.000 0.000 0.000 sqft_lot 0 0 0 0.000 0.000 0.000 floors 0 0 0 0.000 0.000 0.000 waterfront 2391 21451 19060 99.324 11.071 88.253 view 63 19485 19422 90.221 0.292 89.929 condition 0 0 0 0.000 0.000 0.000 grade 0 0 0 0.000 0.000 0.000 sqft_above 0 0 0 0.000 0.000 0.000 sqft_basement 452 13279 12827 61.485 2.093 59.393 yr_built 0 0 0 0.000 0.000 0.000 yr_renovated 3848 20853 17005 96.555 17.817 78.738 zipcode 0 0 0 0.000 0.000 0.000 lat 0 0 0 0.000 0.000 0.000 long 0 0 0 0.000 0.000 0.000 sqft_living15 0 0 0 0.000 0.000 0.000 sqft_lot15 0 0 0 0.000 0.000 0.000 date 0 0 0 0.000 0.000 0.000 price 0 0 0 0.000 0.000 0.000 house_id 0 0 0 0.000 0.000 0.000 id_sales 0 0 0 0.000 0.000 0.000
# NaN values are present in columns with categorical variables and one
# numerical (area) column. 'waterfront' and 'view' are categorical flags:
# a NaN is interpreted as "no waterfront" / "no view" and filled with 0.
# The same reasoning applies to 'sqft_basement' (no basement) and
# 'yr_renovated' (never renovated).
columns_to_fill = ['waterfront', 'view', 'sqft_basement', 'yr_renovated']
# A single fillna(0) suffices; the original's follow-up replace() calls
# (NaN -> 0 after fillna, and 0 -> 0) were no-ops and have been removed.
merged_df[columns_to_fill] = merged_df[columns_to_fill].fillna(0)
# Convert the integer-valued columns to int64 now that no NaNs remain.
merged_df['waterfront'] = merged_df['waterfront'].astype('int64')
merged_df['view'] = merged_df['view'].astype('int64')
merged_df['yr_renovated'] = merged_df['yr_renovated'].astype('int64')
# Display the first few rows to verify changes
merged_df[['waterfront', 'view', 'sqft_basement', 'yr_renovated']].head()
| waterfront | view | sqft_basement | yr_renovated | |
|---|---|---|---|---|
| 0 | 0 | 0 | 0.000 | 0 |
| 1 | 0 | 0 | 0.000 | 0 |
| 2 | 0 | 0 | 900.000 | 0 |
| 3 | 0 | 0 | 0.000 | 0 |
| 4 | 0 | 0 | 420.000 | 0 |
# Some 'yr_renovated' entries are the year multiplied by ten (e.g. 20130
# instead of 2013 — visible in the raw data). Dividing the WHOLE column by 10,
# as the original did, corrupts the valid years (1991 -> 199.1), so only the
# out-of-range entries are rescaled; integer division keeps the int64 dtype.
anomalous = merged_df['yr_renovated'] > 2100
merged_df.loc[anomalous, 'yr_renovated'] = merged_df.loc[anomalous, 'yr_renovated'] // 10
# Count duplicated rows (False = unique row); the recorded output shows the
# frame contains no duplicates.
merged_df.duplicated().value_counts()
False 21597 Name: count, dtype: int64
# Re-inspect dtypes after the fill/cast operations above.
merged_df.dtypes
id_details int64 bedrooms float64 bathrooms float64 sqft_living float64 sqft_lot float64 floors float64 waterfront int64 view int64 condition int64 grade int64 sqft_above float64 sqft_basement float64 yr_built int64 yr_renovated int64 zipcode int64 lat float64 long float64 sqft_living15 float64 sqft_lot15 float64 date object price float64 house_id int64 id_sales int64 dtype: object
# Inspect the Python type of the first 'date' entry.
type(merged_df['date'][0])
pandas._libs.tslibs.timestamps.Timestamp
# change "date" dtype to datetime with format %Y/%m/%d
merged_df['date'] = pd.to_datetime(merged_df['date'])
Descriptive stats
# Summary statistics (count, mean, quartiles, min/max, std) of the frame.
merged_df.describe()
| id_details | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront | view | condition | grade | ... | yr_renovated | zipcode | lat | long | sqft_living15 | sqft_lot15 | date | price | house_id | id_sales | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 21597.000 | 21597.000 | 21597.000 | 21597.000 | 21597.000 | 21597.000 | 21597.000 | 21597.000 | 21597.000 | 21597.000 | ... | 21597.000 | 21597.000 | 21597.000 | 21597.000 | 21597.000 | 21597.000 | 21597 | 21597.000 | 21597.000 | 21597.000 |
| mean | 4580474287.771 | 3.373 | 2.116 | 2080.322 | 15099.409 | 1.494 | 0.007 | 0.233 | 3.410 | 7.658 | ... | 68.758 | 98077.952 | 47.560 | -122.214 | 1986.620 | 12758.284 | 2014-10-29 04:20:38.171968512 | 540296.574 | 4580474287.771 | 10799.000 |
| min | 1000102.000 | 1.000 | 0.500 | 370.000 | 520.000 | 1.000 | 0.000 | 0.000 | 1.000 | 3.000 | ... | 0.000 | 98001.000 | 47.156 | -122.519 | 399.000 | 651.000 | 2014-05-02 00:00:00 | 78000.000 | 1000102.000 | 1.000 |
| 25% | 2123049175.000 | 3.000 | 1.750 | 1430.000 | 5040.000 | 1.000 | 0.000 | 0.000 | 3.000 | 7.000 | ... | 0.000 | 98033.000 | 47.471 | -122.328 | 1490.000 | 5100.000 | 2014-07-22 00:00:00 | 322000.000 | 2123049175.000 | 5400.000 |
| 50% | 3904930410.000 | 3.000 | 2.250 | 1910.000 | 7618.000 | 1.500 | 0.000 | 0.000 | 3.000 | 7.000 | ... | 0.000 | 98065.000 | 47.572 | -122.231 | 1840.000 | 7620.000 | 2014-10-16 00:00:00 | 450000.000 | 3904930410.000 | 10799.000 |
| 75% | 7308900490.000 | 4.000 | 2.500 | 2550.000 | 10685.000 | 2.000 | 0.000 | 0.000 | 4.000 | 8.000 | ... | 0.000 | 98118.000 | 47.678 | -122.125 | 2360.000 | 10083.000 | 2015-02-17 00:00:00 | 645000.000 | 7308900490.000 | 16198.000 |
| max | 9900000190.000 | 33.000 | 8.000 | 13540.000 | 1651359.000 | 3.500 | 1.000 | 4.000 | 5.000 | 13.000 | ... | 2015.000 | 98199.000 | 47.778 | -121.315 | 6210.000 | 871200.000 | 2015-05-27 00:00:00 | 7700000.000 | 9900000190.000 | 21597.000 |
| std | 2876735715.748 | 0.926 | 0.769 | 918.106 | 41412.637 | 0.540 | 0.082 | 0.765 | 0.651 | 1.173 | ... | 364.037 | 53.513 | 0.139 | 0.141 | 685.230 | 27274.442 | NaN | 367368.140 | 2876735715.748 | 6234.661 |
8 rows × 23 columns
A correlation matrix is given below. The price column shows correlation with the grade, room-count, and area (square-footage) columns.
# Correlation matrix of the numeric columns, rendered as a heatmap.
import seaborn as sns
import matplotlib.pyplot as plt
# Restrict to numeric columns: the frame contains a datetime 'date' column,
# and DataFrame.corr() without numeric_only=True raises a TypeError on
# non-numeric data in pandas >= 2.0.
correlation_matrix = merged_df.corr(numeric_only=True)
# Set up the matplotlib figure
plt.figure(figsize=(15, 10))
# Heatmap with per-cell annotations for readability.
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', linewidths=1,
            fmt=".2f", annot_kws={"size": 10, "color": "black"})
plt.title('Correlation Heatmap of King County Home Sales Dataset')
plt.show()
Plotted histograms for continuous columns.
# Plot a histogram for each continuous column in its own figure.
continuous_columns = ['sqft_living', 'sqft_lot', 'sqft_above', 'sqft_basement',
                      'lat', 'long', 'sqft_living15', 'sqft_lot15', 'price']
for column in continuous_columns:
    plt.figure(figsize=(10, 6))
    # Use merged_df directly instead of the 'df' alias for consistency with
    # the surrounding cells (both names reference the same frame).
    plt.hist(merged_df[column], bins=50, color='skyblue', edgecolor='black')
    plt.title(f'Distribution of {column}')
    plt.xlabel(column)
    plt.ylabel('Frequency')
    plt.show()
Plotted bar charts for categorical columns.
# Bar charts of value counts for every categorical column.
categorical_columns = ['bedrooms', 'bathrooms', 'floors', 'waterfront', 'view',
                       'condition', 'grade', 'yr_built', 'yr_renovated', 'zipcode']
for cat_col in categorical_columns:
    # Frequency of each distinct value in this column.
    value_counts = merged_df[cat_col].value_counts()
    plt.figure(figsize=(12, 6))
    value_counts.plot(kind='bar', color='lightblue', edgecolor='black')
    plt.title(f'Count of {cat_col}')
    plt.xlabel(cat_col)
    plt.ylabel('Frequency')
    plt.show()
Price distribution by location using Plotly
# 1. Insight: geographical distribution of home sale prices.
import plotly.express as px
# Scatter map: marker position = house lat/long; colour and size = price.
fig = px.scatter_mapbox(merged_df, lat='lat', lon='long', color='price',
                        size='price', color_continuous_scale=px.colors.cyclical.IceFire,
                        mapbox_style="carto-positron",
                        title='Geographical Distribution of Home Prices in King County')
fig.show()
# Persist the interactive figure for sharing outside the notebook.
fig.write_html('plot_price.html')
Geographical Insight: Price Distribution by Location using Folium
import folium
from folium.plugins import MarkerCluster
import pandas as pd

# Base map centred on King County.
king_county_map = folium.Map(location=[47.5, -122.2], zoom_start=9, tiles='cartodbpositron')
# Draw one circle per sale directly on the map (no clustering) so markers
# remain visible when zoomed out.
for _, house in merged_df.iterrows():
    # Houses above the $3M mark are drawn in red, the rest in blue.
    if house['price'] > 3000000:
        marker_color = 'red'
    else:
        marker_color = 'blue'
    folium.CircleMarker(
        location=(house['lat'], house['long']),
        radius=house['price'] / 2e5,  # scale the price down to a usable radius
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.6,
        popup=folium.Popup(f"Price: ${house['price']:,}", parse_html=True),
    ).add_to(king_county_map)
# Persist the interactive map to disk.
king_county_map.save('folium_price_distribution.html')
1. Insight: How does proximity to specific geographic features, such as waterfronts or central locations, influence house prices in King County?¶
Average Price Comparison: The average price of waterfront properties is significantly higher ($1,717,214) compared to non-waterfront properties ($532,286). This shows that proximity to a waterfront contributes greatly to property value.
Geographical Distribution: The scatter plot shows the geographical distribution of houses, with waterfront properties marked distinctly. You can observe that waterfront properties are typically located in specific regions near water bodies, and these tend to have higher prices, as reflected by the larger markers.
These visualizations support the hypothesis that proximity to water significantly influences property prices.
# 1. Insight / research question: how does proximity to specific geographic
# features, such as waterfronts or central locations, influence house prices
# in King County?
# Findings (from the outputs below): waterfront homes average ~$1,717,214 vs
# ~$532,286 for non-waterfront homes, and the scatter plot shows the
# expensive waterfront stock clustered in specific regions near water —
# supporting the hypothesis that water proximity drives property value.
import matplotlib.pyplot as plt
import seaborn as sns
# Split the data on the binary 'waterfront' flag.
waterfront_properties = merged_df[merged_df['waterfront'] == 1]
non_waterfront_properties = merged_df[merged_df['waterfront'] == 0]
# Mean sale price within each group.
avg_price_waterfront = waterfront_properties['price'].mean()
avg_price_non_waterfront = non_waterfront_properties['price'].mean()
# Bar chart comparing the two group averages.
plt.figure(figsize=(8, 6))
sns.barplot(x=['Waterfront', 'Non-Waterfront'], y=[avg_price_waterfront, avg_price_non_waterfront], palette='Blues')
plt.title('Average House Price: Waterfront vs. Non-Waterfront')
plt.ylabel('Average Price')
plt.show()
# Map-style scatter: position = long/lat, colour = waterfront flag, size = price.
plt.figure(figsize=(12, 8))
sns.scatterplot(data=merged_df, x='long', y='lat', hue='waterfront', size='price', sizes=(20, 200), alpha=0.6, palette='cool')
plt.title('House Prices by Location (Longitude, Latitude) - Waterfront vs. Non-Waterfront')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', title='Waterfront')
plt.show()
# Plain-dict summary of the two group averages (displayed as cell output).
price_impact_summary = {
    'Average Price (Waterfront)': avg_price_waterfront,
    'Average Price (Non-Waterfront)': avg_price_non_waterfront
}
price_impact_summary
{'Average Price (Waterfront)': 1717214.7260273972,
'Average Price (Non-Waterfront)': 532286.2219943126}
2. Insight: Renovations are very impactful across all grades but particularly beneficial for medium and high-grade properties.¶
% change between renovated and not renovated grade_categories: Low Grade: 18.20 - Medium Grade: 44.27 - High Grade: 52.66
# 2. Insight: renovations lift prices across all grades, most strongly for
# medium- and high-grade properties.
# Binary renovation flag (1 = renovated at least once), derived without
# modifying the original dataframe.
was_renovated = merged_df['yr_renovated'].apply(lambda x: 1 if x > 0 else 0)
# Split 'grade' into three quantile-based buckets.
grade_category = pd.qcut(merged_df['grade'], q=3, labels=['Low', 'Medium', 'High'])
# Assemble a working frame so the grouping never touches merged_df.
data_for_renovation_analysis = pd.DataFrame({
    'was_renovated': was_renovated,
    'grade_category': grade_category,
    'price': merged_df['price']
})
# Average price per (grade bucket, renovation status) pair.
avg_price_by_grade_renovation = data_for_renovation_analysis.groupby(['grade_category', 'was_renovated'])['price'].mean().reset_index()
# Grouped bar chart of the averages.
plt.figure(figsize=(10, 6))
sns.barplot(data=avg_price_by_grade_renovation, x='grade_category', y='price', hue='was_renovated', palette='muted')
plt.title('Average House Price by Grade Category and Renovation Status')
plt.xlabel('Grade Category')
plt.ylabel('Average Price')
plt.legend(title='Renovated')
plt.show()
# Mean price per grade bucket, separately for renovated / non-renovated homes.
avg_price_renovated = data_for_renovation_analysis[data_for_renovation_analysis['was_renovated'] == 1].groupby('grade_category')['price'].mean()
avg_price_non_renovated = data_for_renovation_analysis[data_for_renovation_analysis['was_renovated'] == 0].groupby('grade_category')['price'].mean()
# Relative price uplift (%) of renovated over non-renovated homes per bucket.
price_increase_percentage = ((avg_price_renovated - avg_price_non_renovated) / avg_price_non_renovated) * 100
# Display the percentage increase for each grade category
price_increase_percentage
grade_category Low 18.203 Medium 44.277 High 52.656 Name: price, dtype: float64
3. Insight: Does the number of floors in a property influence its sale price, and how does this effect vary across different property grades?¶
- Properties with more floors generally have a higher average price.
- High-grade properties consistently have a higher average price regardless of the number of floors.
- For low and medium-grade properties, the average price increases with the number of floors, but at a slower rate compared to high-grade properties.
# 3. Insight: does the number of floors influence sale price, and how does
# this effect vary across property grades?
# Observations from the chart below:
#  - properties with more floors generally average a higher price;
#  - high-grade properties command the highest averages at every floor count;
#  - for low/medium grades the price rises with floors, but more slowly than
#    for high-grade properties.
# Quantile-based grade buckets, computed without modifying merged_df.
grade_category = pd.qcut(merged_df['grade'], q=3, labels=['Low', 'Medium', 'High'])
# Working frame for the grouping, leaving merged_df untouched.
data_for_floors_analysis = pd.DataFrame({
    'floors': merged_df['floors'],
    'grade_category': grade_category,
    'price': merged_df['price']
})
# Average price per (floor count, grade bucket) pair.
avg_price_by_floors_grade = data_for_floors_analysis.groupby(['floors', 'grade_category'])['price'].mean().reset_index()
# Grouped bar chart of the averages.
sns.set(style="whitegrid")
plt.figure(figsize=(12, 8))
sns.barplot(data=avg_price_by_floors_grade, x='floors', y='price', hue='grade_category', palette='muted')
plt.title('Average House Price by Number of Floors and Grade Category')
plt.xlabel('Number of Floors')
plt.ylabel('Average Price')
plt.legend(title='Grade Category')
plt.show()
1. Recommendation for a Home Seller Client¶
- Top 10% Central Houses: These houses are in the highest price bracket, typically large with higher numbers of bedrooms and bathrooms.
- Average-Priced Houses: These properties fall into a more average price range, suitable for discreet purchases.
# Recommendations for a home seller.
# 1. Top 10% houses: highest price bracket, typically large with more
#    bedrooms and bathrooms. Average-priced houses: middle price range,
#    suitable for discreet purchases.
# NOTE(review): the "central"/"outskirt" labels below are based purely on
# price quantiles, not on location — confirm whether a geographic filter
# was intended.
# Price cut-off for the top 10% of properties.
top_10_percent_price_threshold = merged_df['price'].quantile(0.90)
# Properties at or above the 90th price percentile.
top_10_percent_houses = merged_df[merged_df['price'] >= top_10_percent_price_threshold]
# Properties in the interquartile (middle 50%) price range.
avg_price_thresholds = merged_df['price'].quantile([0.25, 0.75])
average_outskirt_houses = merged_df[
    (merged_df['price'] >= avg_price_thresholds.iloc[0]) &
    (merged_df['price'] <= avg_price_thresholds.iloc[1])
]
# Summary statistics for each segment.
print("Summary of Top 10% Central Houses:")
print(top_10_percent_houses.describe())
print("\nSummary of Average-Priced Outskirt Houses:")
print(average_outskirt_houses.describe())
# Overlaid price distributions of the two segments.
plt.figure(figsize=(14, 6))
sns.histplot(top_10_percent_houses['price'], bins=30, kde=True, color='blue', label='Top 10% Central Houses', alpha=0.6)
sns.histplot(average_outskirt_houses['price'], bins=30, kde=True, color='green', label='Average-Priced Houses', alpha=0.6)
plt.title('Price Distribution: Top 10% Central Houses vs. Average Outskirt Houses')
plt.xlabel('Price')
plt.ylabel('Frequency')
plt.legend()
plt.show()
Summary of Top 10% Central Houses:
id_details bedrooms bathrooms sqft_living sqft_lot floors \
count 2161.000 2161.000 2161.000 2161.000 2161.000 2161.000
mean 4246407508.064 4.018 3.024 3566.449 23463.626 1.789
min 31000165.000 1.000 1.000 1140.000 609.000 1.000
25% 1818800235.000 3.000 2.500 2820.000 6350.000 1.500
50% 3629921000.000 4.000 3.000 3450.000 10260.000 2.000
75% 6790830060.000 4.000 3.500 4133.000 17833.000 2.000
max 9839301165.000 10.000 8.000 13540.000 982998.000 3.500
std 2844364798.400 0.892 0.849 1138.207 60856.669 0.485
waterfront view condition grade ... yr_renovated zipcode \
count 2161.000 2161.000 2161.000 2161.000 ... 2161.000 2161.000
mean 0.049 1.011 3.460 9.512 ... 178.561 98069.752
min 0.000 0.000 1.000 6.000 ... 0.000 98003.000
25% 0.000 0.000 3.000 9.000 ... 0.000 98027.000
50% 0.000 0.000 3.000 9.000 ... 0.000 98056.000
75% 0.000 2.000 4.000 10.000 ... 0.000 98112.000
max 1.000 4.000 5.000 13.000 ... 2015.000 98199.000
std 0.216 1.445 0.697 1.221 ... 570.333 54.504
lat long sqft_living15 sqft_lot15 \
count 2161.000 2161.000 2161.000 2161.000
mean 47.618 -122.217 2950.284 17681.917
min 47.260 -122.514 980.000 977.000
25% 47.572 -122.299 2360.000 6011.000
50% 47.627 -122.218 2910.000 10005.000
75% 47.659 -122.140 3520.000 15720.000
max 47.776 -121.731 6210.000 871200.000
std 0.065 0.119 836.999 38647.906
date price house_id id_sales
count 2161 2161.000 2161.000 2161.000
mean 2014-10-28 23:36:40.647848192 1344848.380 4246407508.064 11190.145
min 2014-05-02 00:00:00 887000.000 31000165.000 6.000
25% 2014-07-14 00:00:00 975000.000 1818800235.000 5585.000
50% 2014-10-15 00:00:00 1160000.000 3629921000.000 11454.000
75% 2015-02-25 00:00:00 1490000.000 6790830060.000 16791.000
max 2015-05-27 00:00:00 7700000.000 9839301165.000 21591.000
std NaN 588034.461 2844364798.400 6414.441
[8 rows x 23 columns]
Summary of Average-Priced Outskirt Houses:
id_details bedrooms bathrooms sqft_living sqft_lot floors \
count 10832.000 10832.000 10832.000 10832.000 10832.000 10832.000
mean 4637339265.170 3.318 2.064 1930.545 14322.594 1.495
min 1200021.000 1.000 0.750 410.000 600.000 1.000
25% 2223831514.000 3.000 1.750 1450.000 4652.750 1.000
50% 3930200240.000 3.000 2.250 1870.000 7204.500 1.000
75% 7304301121.500 4.000 2.500 2340.000 10123.500 2.000
max 9895000040.000 33.000 7.500 5461.000 1074218.000 3.500
std 2857178737.929 0.914 0.650 646.870 36087.762 0.557
waterfront view condition grade ... yr_renovated zipcode \
count 10832.000 10832.000 10832.000 10832.000 ... 10832.000 10832.000
mean 0.002 0.137 3.383 7.495 ... 52.088 98081.387
min 0.000 0.000 1.000 4.000 ... 0.000 98001.000
25% 0.000 0.000 3.000 7.000 ... 0.000 98034.000
50% 0.000 0.000 3.000 7.000 ... 0.000 98074.000
75% 0.000 0.000 4.000 8.000 ... 0.000 98118.000
max 1.000 4.000 5.000 11.000 ... 2015.000 98199.000
std 0.043 0.560 0.624 0.807 ... 318.044 50.283
lat long sqft_living15 sqft_lot15 \
count 10832.000 10832.000 10832.000 10832.000
mean 47.590 -122.214 1887.869 12489.243
min 47.156 -122.519 399.000 748.000
25% 47.512 -122.338 1500.000 4800.000
50% 47.606 -122.237 1820.000 7266.000
75% 47.697 -122.121 2230.000 9730.000
max 47.778 -121.321 4362.000 438213.000
std 0.130 0.147 508.725 26810.218
date price house_id id_sales
count 10832 10832.000 10832.000 10832.000
mean 2014-10-29 20:32:12.939438592 462139.484 4637339265.170 10955.775
min 2014-05-02 00:00:00 322000.000 1200021.000 2.000
25% 2014-07-22 00:00:00 385000.000 2223831514.000 5524.250
50% 2014-10-16 00:00:00 450000.000 3930200240.000 10930.500
75% 2015-02-18 00:00:00 536000.000 7304301121.500 16464.250
max 2015-05-24 00:00:00 645000.000 9895000040.000 21597.000
std NaN 90223.490 2857178737.929 6281.436
[8 rows x 23 columns]
2. Recommendation for a Home Seller Client¶
- Top 10% Central Houses tend to have slightly better condition on average compared to the average-priced houses in the outskirts. This might imply less need for repairs or renovations for top-value properties.
# Recommendations for a home seller.
# 2. Condition comparison: top-10% houses average a slightly better
#    'condition' score than the mid-priced segment (3.46 vs 3.38 per the
#    output below), implying less need for repairs before sale.
# Mean 'condition' score of the top-10% price segment.
avg_condition_top_10 = top_10_percent_houses['condition'].mean()
# Mean 'condition' score of the interquartile price segment.
avg_condition_outskirts = average_outskirt_houses['condition'].mean()
print(f"Average Condition (Top 10% Central Houses): {avg_condition_top_10:.2f}")
print(f"Average Condition (Average-Priced Outskirt Houses): {avg_condition_outskirts:.2f}")
Average Condition (Top 10% Central Houses): 3.46 Average Condition (Average-Priced Outskirt Houses): 3.38
3. Recommendation for a Home Seller Client¶
- Number of Bathrooms in Top 10% Central Houses vs. Average Outskirt Houses:
- Top 10% Central Houses generally have more bathrooms compared to average-priced outskirt houses.
# Recommendations for a home seller.
# 3. Bathroom comparison: top-10% houses generally have more bathrooms than
#    the mid-priced segment (3.02 vs 2.06 per the output below).
# Mean bathroom count of the top-10% price segment.
avg_bathrooms_top_10 = top_10_percent_houses['bathrooms'].mean()
# Mean bathroom count of the interquartile price segment.
avg_bathrooms_outskirts = average_outskirt_houses['bathrooms'].mean()
print(f"Average Number of Bathrooms (Top 10% Central Houses): {avg_bathrooms_top_10:.2f}")
print(f"Average Number of Bathrooms (Average-Priced Outskirt Houses): {avg_bathrooms_outskirts:.2f}")
Average Number of Bathrooms (Top 10% Central Houses): 3.02 Average Number of Bathrooms (Average-Priced Outskirt Houses): 2.06
# Cutoff for the top decile of prices; every house at or above it is treated
# as a high-value "central" property. Both names are reused by later cells.
price_threshold = df['price'].quantile(0.90)
central_houses = df.loc[df['price'] >= price_threshold]

# Histogram (with KDE overlay) of the high-value segment's prices.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(central_houses['price'], bins=30, kde=True, color='blue', ax=ax)
ax.set_title('Price Distribution for Top 10% High-Value Central Properties')
ax.set_xlabel('Price')
ax.set_ylabel('Frequency')
plt.show()
import seaborn as sns

# Restrict to above-average-grade central houses that have been renovated
# at least once (yr_renovated == 0 means "never renovated" in this dataset
# per the cells above that treat 0 as the no-renovation marker).
grade_cutoff = df['grade'].mean()
was_renovated = central_houses['yr_renovated'] > 0
high_grade_renovated = central_houses[(central_houses['grade'] > grade_cutoff) & was_renovated]

# Scatter of grade vs. price; hue encodes renovation year, marker size scales
# with price so the most expensive properties stand out.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=high_grade_renovated,
    x='grade',
    y='price',
    hue='yr_renovated',
    palette='cool',
    size='price',
    sizes=(20, 200),
    alpha=0.7,
    ax=ax,
)
ax.set_title('High-Grade and Renovated Properties')
ax.set_xlabel('Grade')
ax.set_ylabel('Price')
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', title='Year Renovated')
plt.show()
# Dealing with outliers.
# A boxplot is a quick visual check for extreme price values: any point
# beyond the "whiskers" is a potential outlier worth investigating.
import matplotlib.pyplot as plt
import seaborn as sns

fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=merged_df, x='price', ax=ax)
ax.set_title('Boxplot of House Prices')
ax.set_xlabel('Price')
plt.show()
# The points that lie beyond the "whiskers" of the boxplot can be considered
# as potential outliers.
# Quantile-based analysis: compute the interquartile range (IQR) and flag
# everything outside the classic 1.5*IQR fences as an outlier.
q1, q3 = df['price'].quantile([0.25, 0.75])
iqr = q3 - q1
# Prices cannot be negative, so the lower fence is floored at zero.
lower_bound = max(0, q1 - 1.5 * iqr)
upper_bound = q3 + 1.5 * iqr
print(f"Lower Bound: {lower_bound}, Upper Bound: {upper_bound}")

# Rows whose price falls outside [lower_bound, upper_bound].
outliers = df[(df['price'] < lower_bound) | (df['price'] > upper_bound)]
print(f"Number of Outliers: {len(outliers)}")
Lower Bound: 0, Upper Bound: 1129500.0 Number of Outliers: 1158
# Cap outliers for analysis only: clip prices into [lower_bound, upper_bound]
# (bounds from the IQR cell above) without adding a column to the DataFrame.
price_capped = df['price'].clip(lower_bound, upper_bound)

# Distribution of the capped prices.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(price_capped, kde=True, ax=ax)
ax.set_title('Price Distribution After Capping Outliers')
ax.set_xlabel('Price')
ax.set_ylabel('Frequency')
plt.show()
# Distribution of prices in the merged data after the outlier-handling step.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(merged_df['price'], kde=True, ax=ax)
ax.set_title('Price Distribution After Handling Outliers')
ax.set_xlabel('Price')
ax.set_ylabel('Frequency')
plt.show()
# Apply a log transformation to 'price' for plotting only — no new column is
# added to the DataFrame. (The duplicate `import numpy as np` lines from the
# original cell are consolidated into one.)
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# np.log1p(x) computes log(x + 1) directly, with better floating-point
# precision near zero than np.log(x + 1).
log_price = np.log1p(df['price'])

# Histogram (with KDE overlay) of the log-transformed prices.
plt.figure(figsize=(10, 6))
sns.histplot(log_price, kde=True)
plt.title('Log-Transformed Price Distribution')
plt.xlabel('Log(Price)')
plt.ylabel('Frequency')
plt.show()
# Prices are right-skewed and some outliers lie above 1M.
print(merged_df['price'].describe())

# sns.distplot was deprecated in seaborn 0.11 and removed in later releases;
# histplot with stat='density' and a KDE overlay reproduces the same figure
# (the original passed hist_kws={'alpha': 0.4}, hence alpha=0.4 here).
plt.figure(figsize=(9, 8))
sns.histplot(merged_df['price'], color='g', bins=140, stat='density', alpha=0.4, kde=True);
count 21597.000 mean 540296.574 std 367368.140 min 78000.000 25% 322000.000 50% 450000.000 75% 645000.000 max 7700000.000 Name: price, dtype: float64
pip install nbconvert
Note: you may need to restart the kernel to use updated packages.Requirement already satisfied: nbconvert in c:\users\onur_\bootcamp\eda_project\.venv\lib\site-packages (7.16.4) Requirement already satisfied: beautifulsoup4 in c:\users\onur_\bootcamp\eda_project\.venv\lib\site-packages (from nbconvert) (4.12.3) Requirement already satisfied: bleach!=5.0.0 in c:\users\onur_\bootcamp\eda_project\.venv\lib\site-packages (from nbconvert) (6.2.0) Requirement already satisfied: defusedxml in c:\users\onur_\bootcamp\eda_project\.venv\lib\site-packages (from nbconvert) (0.7.1) Requirement already satisfied: jinja2>=3.0 in c:\users\onur_\bootcamp\eda_project\.venv\lib\site-packages (from nbconvert) (3.1.4) Requirement already satisfied: jupyter-core>=4.7 in c:\users\onur_\bootcamp\eda_project\.venv\lib\site-packages (from nbconvert) (5.7.2) Requirement already satisfied: jupyterlab-pygments in c:\users\onur_\bootcamp\eda_project\.venv\lib\site-packages (from nbconvert) (0.3.0) Requirement already satisfied: markupsafe>=2.0 in c:\users\onur_\bootcamp\eda_project\.venv\lib\site-packages (from nbconvert) (3.0.2) Requirement already satisfied: mistune<4,>=2.0.3 in c:\users\onur_\bootcamp\eda_project\.venv\lib\site-packages (from nbconvert) (3.0.2) Requirement already satisfied: nbclient>=0.5.0 in c:\users\onur_\bootcamp\eda_project\.venv\lib\site-packages (from nbconvert) (0.10.0) Requirement already satisfied: nbformat>=5.7 in c:\users\onur_\bootcamp\eda_project\.venv\lib\site-packages (from nbconvert) (5.10.4) Requirement already satisfied: packaging in c:\users\onur_\bootcamp\eda_project\.venv\lib\site-packages (from nbconvert) (24.2) Requirement already satisfied: pandocfilters>=1.4.1 in c:\users\onur_\bootcamp\eda_project\.venv\lib\site-packages (from nbconvert) (1.5.1) Requirement already satisfied: pygments>=2.4.1 in c:\users\onur_\bootcamp\eda_project\.venv\lib\site-packages (from nbconvert) (2.18.0) Requirement already satisfied: tinycss2 in 
c:\users\onur_\bootcamp\eda_project\.venv\lib\site-packages (from nbconvert) (1.4.0) Requirement already satisfied: traitlets>=5.1 in c:\users\onur_\bootcamp\eda_project\.venv\lib\site-packages (from nbconvert) (5.14.3) Requirement already satisfied: webencodings in c:\users\onur_\bootcamp\eda_project\.venv\lib\site-packages (from bleach!=5.0.0->nbconvert) (0.5.1) Requirement already satisfied: platformdirs>=2.5 in c:\users\onur_\bootcamp\eda_project\.venv\lib\site-packages (from jupyter-core>=4.7->nbconvert) (4.3.6) Requirement already satisfied: pywin32>=300 in c:\users\onur_\bootcamp\eda_project\.venv\lib\site-packages (from jupyter-core>=4.7->nbconvert) (308) Requirement already satisfied: jupyter-client>=6.1.12 in c:\users\onur_\bootcamp\eda_project\.venv\lib\site-packages (from nbclient>=0.5.0->nbconvert) (8.6.3) Requirement already satisfied: fastjsonschema>=2.15 in c:\users\onur_\bootcamp\eda_project\.venv\lib\site-packages (from nbformat>=5.7->nbconvert) (2.20.0) Requirement already satisfied: jsonschema>=2.6 in c:\users\onur_\bootcamp\eda_project\.venv\lib\site-packages (from nbformat>=5.7->nbconvert) (4.23.0) Requirement already satisfied: soupsieve>1.2 in c:\users\onur_\bootcamp\eda_project\.venv\lib\site-packages (from beautifulsoup4->nbconvert) (2.6) Requirement already satisfied: attrs>=22.2.0 in c:\users\onur_\bootcamp\eda_project\.venv\lib\site-packages (from jsonschema>=2.6->nbformat>=5.7->nbconvert) (24.2.0) Requirement already satisfied: jsonschema-specifications>=2023.03.6 in c:\users\onur_\bootcamp\eda_project\.venv\lib\site-packages (from jsonschema>=2.6->nbformat>=5.7->nbconvert) (2024.10.1) Requirement already satisfied: referencing>=0.28.4 in c:\users\onur_\bootcamp\eda_project\.venv\lib\site-packages (from jsonschema>=2.6->nbformat>=5.7->nbconvert) (0.35.1) Requirement already satisfied: rpds-py>=0.7.1 in c:\users\onur_\bootcamp\eda_project\.venv\lib\site-packages (from jsonschema>=2.6->nbformat>=5.7->nbconvert) (0.21.0) Requirement 
already satisfied: python-dateutil>=2.8.2 in c:\users\onur_\bootcamp\eda_project\.venv\lib\site-packages (from jupyter-client>=6.1.12->nbclient>=0.5.0->nbconvert) (2.9.0.post0) Requirement already satisfied: pyzmq>=23.0 in c:\users\onur_\bootcamp\eda_project\.venv\lib\site-packages (from jupyter-client>=6.1.12->nbclient>=0.5.0->nbconvert) (26.2.0) Requirement already satisfied: tornado>=6.2 in c:\users\onur_\bootcamp\eda_project\.venv\lib\site-packages (from jupyter-client>=6.1.12->nbclient>=0.5.0->nbconvert) (6.4.1) Requirement already satisfied: six>=1.5 in c:\users\onur_\bootcamp\eda_project\.venv\lib\site-packages (from python-dateutil>=2.8.2->jupyter-client>=6.1.12->nbclient>=0.5.0->nbconvert) (1.16.0)
[notice] A new release of pip available: 22.3.1 -> 24.3.1 [notice] To update, run: python.exe -m pip install --upgrade pip
!jupyter nbconvert --to html mynotebook.ipynb
# NOTE: typing `jupyter nbconvert ...` as plain Python raises the SyntaxError
# seen in Cell In[82]. Inside a notebook, prefix shell commands with '!'
# (as above), or run the command in a terminal without the '$' prompt.